{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pytorch to Tensorflow Conversion Test Notebook\n",
    "\n",
    "To run this notebook follow these steps, modifying the **Config** section as necessary:\n",
    "\n",
    "1. Point `pt_model_dir` to your local directory containing the pytorch Bert model to be converted.\n",
    "2. Point `tf_bert_dir` to your clone of Google's Bert implementation which can be found here: https://github.com/google-research/bert.\n",
    "\n",
    "Note: \n",
    "1. This feature currently only supports the base BERT models (uncased/cased).\n",
    "2. Tensorflow model will be dumped in `tf_model_dir`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "model_cls  = 'BertModel'\n",
    "model_typ  = 'bert-base-uncased'\n",
    "token_cls  = 'BertTokenizer'\n",
    "max_seq    = 12\n",
    "CLS        = \"[CLS]\"\n",
    "SEP        = \"[SEP]\"\n",
    "MASK       = \"[MASK]\"\n",
    "CLS_IDX    = 0\n",
    "layer_idxs = tuple(range(12))\n",
    "input_text = \"jim henson was a puppeteer\"\n",
    "\n",
    "pt_model_dir = \"/home/ubuntu/.pytorch-pretrained-BERT-cache/{}\".format(model_typ)\n",
    "tf_bert_dir  = \"/home/ubuntu/bert\"\n",
    "\n",
    "pt_vocab_file  = os.path.join(pt_model_dir, \"vocab.txt\")\n",
    "pt_init_ckpt   = os.path.join(pt_model_dir, model_typ.replace(\"-\", \"_\") + \".bin\")\n",
    "tf_model_dir   = os.path.join(pt_model_dir, 'tf')\n",
    "tf_vocab_file  = os.path.join(tf_model_dir, \"vocab.txt\")\n",
    "tf_init_ckpt   = os.path.join(tf_model_dir, model_typ.replace(\"-\", \"_\") + \".ckpt\")\n",
    "tf_config_file = os.path.join(tf_model_dir, \"bert_config.json\")\n",
    "\n",
    "if not os.path.isdir(tf_model_dir): \n",
    "    os.makedirs(tf_model_dir, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def tokenize(text, tokenizer):\n",
    "    text = text.strip().lower()\n",
    "    tok_ids = tokenizer.tokenize(text)\n",
    "    if len(tok_ids) > max_seq - 2:\n",
    "        tok_ids = tok_ids[:max_seq - 2]\n",
    "    tok_ids.insert(CLS_IDX, CLS)\n",
    "    tok_ids.append(SEP)\n",
    "    input_ids = tokenizer.convert_tokens_to_ids(tok_ids)\n",
    "    mask_ids = [1] * len(input_ids)\n",
    "    seg_ids = [0] * len(input_ids)\n",
    "    padding = [0] * (max_seq - len(input_ids))\n",
    "    input_ids += padding\n",
    "    mask_ids += padding\n",
    "    seg_ids += padding\n",
    "    return input_ids, mask_ids, seg_ids"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pytorch execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 231508/231508 [00:00<00:00, 41092464.26B/s]\n",
      "100%|██████████| 407873900/407873900 [00:07<00:00, 58092479.52B/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pytorch embedding shape: (1, 768)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import torch\n",
    "from pytorch_pretrained_bert import (BertConfig,\n",
    "                                     BertModel, \n",
    "                                     BertTokenizer, \n",
    "                                     BertForSequenceClassification)\n",
    "\n",
    "# Save Vocab\n",
    "pt_tokenizer = BertTokenizer.from_pretrained(\n",
    "    pretrained_model_name_or_path=model_typ, \n",
    "    cache_dir=pt_model_dir)\n",
    "pt_tokenizer.save_vocabulary(pt_model_dir)\n",
    "pt_tokenizer.save_vocabulary(tf_model_dir)\n",
    "\n",
    "# Save Model\n",
    "pt_model = BertModel.from_pretrained(\n",
    "    pretrained_model_name_or_path=model_typ, \n",
    "    cache_dir=pt_model_dir).to('cpu')\n",
    "pt_model.eval()\n",
    "pt_model.config.hidden_dropout_prob = 0.0\n",
    "pt_model.config.attention_probs_dropout_prob = 0.0\n",
    "pt_model.config.to_json_file(tf_config_file)\n",
    "torch.save(pt_model.state_dict(), pt_init_ckpt)\n",
    "\n",
    "# Inputs\n",
    "input_ids_pt, mask_ids_pt, seg_ids_pt = tokenize(input_text, pt_tokenizer)\n",
    "\n",
    "# PT Embedding\n",
    "tok_tensor = torch.tensor(input_ids_pt).to('cpu').unsqueeze(0)\n",
    "seg_tensor = torch.tensor(seg_ids_pt).to('cpu').unsqueeze(0)\n",
    "msk_tensor = torch.tensor(mask_ids_pt).to('cpu').unsqueeze(0)\n",
    "attn_blks, nsp_logits = pt_model(tok_tensor, seg_tensor, msk_tensor)\n",
    "pt_embedding = nsp_logits.detach().numpy() \n",
    "print(\"Pytorch embedding shape: {}\".format(pt_embedding.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pytorch &rarr; Tensorflow conversion"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py:263: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Colocations handled automatically by placer.\n",
      "bert/embeddings/word_embeddings                             initialized\n",
      "bert/embeddings/position_embeddings                         initialized\n",
      "bert/embeddings/token_type_embeddings                       initialized\n",
      "bert/embeddings/LayerNorm/gamma                             initialized\n",
      "bert/embeddings/LayerNorm/beta                              initialized\n",
      "bert/encoder/layer_0/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_0/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_0/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_0/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_0/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_0/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_0/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_0/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_0/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_0/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_0/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_0/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_0/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_0/output/dense/bias                      initialized\n",
      "bert/encoder/layer_0/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_0/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_1/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_1/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_1/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_1/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_1/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_1/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_1/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_1/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_1/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_1/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_1/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_1/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_1/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_1/output/dense/bias                      initialized\n",
      "bert/encoder/layer_1/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_1/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_2/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_2/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_2/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_2/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_2/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_2/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_2/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_2/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_2/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_2/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_2/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_2/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_2/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_2/output/dense/bias                      initialized\n",
      "bert/encoder/layer_2/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_2/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_3/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_3/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_3/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_3/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_3/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_3/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_3/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_3/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_3/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_3/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_3/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_3/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_3/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_3/output/dense/bias                      initialized\n",
      "bert/encoder/layer_3/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_3/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_4/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_4/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_4/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_4/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_4/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_4/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_4/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_4/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_4/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_4/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_4/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_4/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_4/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_4/output/dense/bias                      initialized\n",
      "bert/encoder/layer_4/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_4/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_5/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_5/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_5/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_5/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_5/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_5/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_5/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_5/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_5/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_5/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_5/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_5/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_5/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_5/output/dense/bias                      initialized\n",
      "bert/encoder/layer_5/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_5/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_6/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_6/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_6/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_6/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_6/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_6/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_6/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_6/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_6/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_6/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_6/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_6/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_6/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_6/output/dense/bias                      initialized\n",
      "bert/encoder/layer_6/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_6/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_7/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_7/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_7/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_7/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_7/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_7/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_7/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_7/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_7/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_7/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_7/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_7/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_7/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_7/output/dense/bias                      initialized\n",
      "bert/encoder/layer_7/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_7/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_8/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_8/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_8/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_8/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_8/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_8/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_8/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_8/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_8/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_8/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_8/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_8/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_8/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_8/output/dense/bias                      initialized\n",
      "bert/encoder/layer_8/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_8/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_9/attention/self/query/kernel            initialized\n",
      "bert/encoder/layer_9/attention/self/query/bias              initialized\n",
      "bert/encoder/layer_9/attention/self/key/kernel              initialized\n",
      "bert/encoder/layer_9/attention/self/key/bias                initialized\n",
      "bert/encoder/layer_9/attention/self/value/kernel            initialized\n",
      "bert/encoder/layer_9/attention/self/value/bias              initialized\n",
      "bert/encoder/layer_9/attention/output/dense/kernel          initialized\n",
      "bert/encoder/layer_9/attention/output/dense/bias            initialized\n",
      "bert/encoder/layer_9/attention/output/LayerNorm/gamma       initialized\n",
      "bert/encoder/layer_9/attention/output/LayerNorm/beta        initialized\n",
      "bert/encoder/layer_9/intermediate/dense/kernel              initialized\n",
      "bert/encoder/layer_9/intermediate/dense/bias                initialized\n",
      "bert/encoder/layer_9/output/dense/kernel                    initialized\n",
      "bert/encoder/layer_9/output/dense/bias                      initialized\n",
      "bert/encoder/layer_9/output/LayerNorm/gamma                 initialized\n",
      "bert/encoder/layer_9/output/LayerNorm/beta                  initialized\n",
      "bert/encoder/layer_10/attention/self/query/kernel           initialized\n",
      "bert/encoder/layer_10/attention/self/query/bias             initialized\n",
      "bert/encoder/layer_10/attention/self/key/kernel             initialized\n",
      "bert/encoder/layer_10/attention/self/key/bias               initialized\n",
      "bert/encoder/layer_10/attention/self/value/kernel           initialized\n",
      "bert/encoder/layer_10/attention/self/value/bias             initialized\n",
      "bert/encoder/layer_10/attention/output/dense/kernel         initialized\n",
      "bert/encoder/layer_10/attention/output/dense/bias           initialized\n",
      "bert/encoder/layer_10/attention/output/LayerNorm/gamma      initialized\n",
      "bert/encoder/layer_10/attention/output/LayerNorm/beta       initialized\n",
      "bert/encoder/layer_10/intermediate/dense/kernel             initialized\n",
      "bert/encoder/layer_10/intermediate/dense/bias               initialized\n",
      "bert/encoder/layer_10/output/dense/kernel                   initialized\n",
      "bert/encoder/layer_10/output/dense/bias                     initialized\n",
      "bert/encoder/layer_10/output/LayerNorm/gamma                initialized\n",
      "bert/encoder/layer_10/output/LayerNorm/beta                 initialized\n",
      "bert/encoder/layer_11/attention/self/query/kernel           initialized\n",
      "bert/encoder/layer_11/attention/self/query/bias             initialized\n",
      "bert/encoder/layer_11/attention/self/key/kernel             initialized\n",
      "bert/encoder/layer_11/attention/self/key/bias               initialized\n",
      "bert/encoder/layer_11/attention/self/value/kernel           initialized\n",
      "bert/encoder/layer_11/attention/self/value/bias             initialized\n",
      "bert/encoder/layer_11/attention/output/dense/kernel         initialized\n",
      "bert/encoder/layer_11/attention/output/dense/bias           initialized\n",
      "bert/encoder/layer_11/attention/output/LayerNorm/gamma      initialized\n",
      "bert/encoder/layer_11/attention/output/LayerNorm/beta       initialized\n",
      "bert/encoder/layer_11/intermediate/dense/kernel             initialized\n",
      "bert/encoder/layer_11/intermediate/dense/bias               initialized\n",
      "bert/encoder/layer_11/output/dense/kernel                   initialized\n",
      "bert/encoder/layer_11/output/dense/bias                     initialized\n",
      "bert/encoder/layer_11/output/LayerNorm/gamma                initialized\n",
      "bert/encoder/layer_11/output/LayerNorm/beta                 initialized\n",
      "bert/pooler/dense/kernel                                    initialized\n",
      "bert/pooler/dense/bias                                      initialized\n"
     ]
    }
   ],
   "source": [
    "from pytorch_pretrained_bert.convert_pytorch_checkpoint_to_tf import main\n",
    "\n",
    "main([\n",
    "    '--model_name', model_typ, \n",
    "    '--pytorch_model_path', pt_init_ckpt,\n",
    "    '--tf_cache_dir', tf_model_dir,\n",
    "    '--cache_dir', pt_model_dir\n",
    "])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tensorflow execution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "WARNING: The TensorFlow contrib module will not be included in TensorFlow 2.0.\n",
      "For more information, please see:\n",
      "  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md\n",
      "  * https://github.com/tensorflow/addons\n",
      "If you depend on functionality not listed there, please file an issue.\n",
      "\n",
      "WARNING:tensorflow:From /home/ubuntu/bert/modeling.py:671: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use keras.layers.dense instead.\n",
      "WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/nlp/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1266: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.\n",
      "Instructions for updating:\n",
      "Use standard file APIs to check for files with this prefix.\n",
      "INFO:tensorflow:Restoring parameters from /home/ubuntu/.pytorch-pretrained-BERT-cache/bert-base-uncased/tf/bert_base_uncased.ckpt\n",
      "Tensorflow embedding shape: (1, 768)\n"
     ]
    }
   ],
   "source": [
    "import tensorflow as tf\n",
    "sys.path.insert(0, tf_bert_dir)\n",
    "import modeling\n",
    "import tokenization\n",
    "\n",
    "tf.reset_default_graph()\n",
    "\n",
    "# Process text\n",
    "tf_tokenizer = tokenization.FullTokenizer(vocab_file=tf_vocab_file)\n",
    "\n",
    "# Graph inputs\n",
    "input_ids_tf, mask_ids_tf, seg_ids_tf = tokenize(input_text, tf_tokenizer)\n",
    "config = modeling.BertConfig.from_json_file(\n",
    "    os.path.join(tf_model_dir, 'bert_config.json'))\n",
    "input_tensor = tf.placeholder(\n",
    "    dtype=tf.int32,\n",
    "    shape=[1, None],\n",
    "    name='input_ids')\n",
    "mask_tensor = tf.placeholder(\n",
    "    dtype=tf.int32,\n",
    "    shape=[1, None],\n",
    "    name='mask_ids')\n",
    "seg_tensor = tf.placeholder(\n",
    "    dtype=tf.int32,\n",
    "    shape=[1, None],\n",
    "    name='seg_ids')\n",
    "tf_model = modeling.BertModel(\n",
    "    config=config,\n",
    "    is_training=False,\n",
    "    input_ids=input_tensor,\n",
    "    input_mask=mask_tensor,\n",
    "    token_type_ids=seg_tensor,\n",
    "    use_one_hot_embeddings=False)\n",
    "output_layer = tf_model.get_pooled_output()\n",
    "\n",
    "# Load tf model\n",
    "session = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))\n",
    "vars_to_load = [v for v in tf.global_variables()]\n",
    "session.run(tf.variables_initializer(var_list=vars_to_load))\n",
    "saver = tf.train.Saver(vars_to_load)\n",
    "saver.restore(session, save_path=tf_init_ckpt)\n",
    "\n",
    "# TF Embedding\n",
    "fetches = output_layer\n",
    "feed_dict  = {\n",
    "    input_tensor: [input_ids_tf],\n",
    "    mask_tensor: [mask_ids_tf],\n",
    "    seg_tensor: [seg_ids_tf]\n",
    "}\n",
    "tf_embedding = session.run(fetches=fetches, feed_dict=feed_dict)\n",
    "print(\"Tensorflow embedding shape: {}\".format(tf_embedding.shape))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare Tokenization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TOKEN_IDS_PT: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
      "TOKEN_IDS_TF: [101, 3958, 27227, 2001, 1037, 13997, 11510, 102, 0, 0, 0, 0]\n",
      "SEG_IDS_PT:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
      "SEG_IDS_TF:   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
      "MASK_IDS_PT:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n",
      "MASK_IDS_TF:  [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]\n"
     ]
    }
   ],
   "source": [
    "print(\"TOKEN_IDS_PT: {}\".format(input_ids_pt))\n",
    "print(\"TOKEN_IDS_TF: {}\".format(input_ids_tf))\n",
    "print(\"SEG_IDS_PT:   {}\".format(seg_ids_pt))\n",
    "print(\"SEG_IDS_TF:   {}\".format(seg_ids_tf))\n",
    "print(\"MASK_IDS_PT:  {}\".format(mask_ids_pt))\n",
    "print(\"MASK_IDS_TF:  {}\".format(mask_ids_tf))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare Model Weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bert/embeddings/word_embeddings\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\n",
      "TF: shape: (30522, 768) values: [-0.01018257 -0.06154883 -0.02649689 -0.0420608   0.00116716]\n",
      "\n",
      "bert/embeddings/token_type_embeddings\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
      "TF: shape: (2, 768) values: [0.00043164 0.01098826 0.00370439 0.00150542 0.00057812]\n",
      "\n",
      "bert/embeddings/position_embeddings\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\n",
      "TF: shape: (512, 768) values: [ 0.01750538 -0.02563101 -0.03664156 -0.02528613  0.00797095]\n",
      "\n",
      "bert/embeddings/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\n",
      "TF: shape: (768,) values: [-0.02591471 -0.0195513   0.02423946  0.08904593 -0.06281059]\n",
      "\n",
      "bert/embeddings/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\n",
      "TF: shape: (768,) values: [0.9260566  0.8851115  0.85807985 0.8616906  0.8937205 ]\n",
      "\n",
      "bert/encoder/layer_0/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\n",
      "TF: shape: (768, 768) values: [-0.01640572 -0.03257025  0.01046295 -0.04442816 -0.02256124]\n",
      "\n",
      "bert/encoder/layer_0/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\n",
      "TF: shape: (768,) values: [ 0.58488506 -0.3312432  -0.43010172  0.37446147 -0.29811692]\n",
      "\n",
      "bert/encoder/layer_0/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\n",
      "TF: shape: (768, 768) values: [ 0.00807745  0.02652155 -0.01866494  0.01797846  0.00450485]\n",
      "\n",
      "bert/encoder/layer_0/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\n",
      "TF: shape: (768,) values: [ 0.00104306  0.00035106 -0.0024626  -0.00010567 -0.00119283]\n",
      "\n",
      "bert/encoder/layer_0/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\n",
      "TF: shape: (768, 768) values: [ 0.01144261 -0.02663044  0.01911472 -0.02206182 -0.00287949]\n",
      "\n",
      "bert/encoder/layer_0/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\n",
      "TF: shape: (768,) values: [-0.01184616 -0.01596605 -0.00251847  0.01736802  0.00449983]\n",
      "\n",
      "bert/encoder/layer_0/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
      "TF: shape: (768, 768) values: [ 0.00581949  0.03170148 -0.06135742 -0.01706108 -0.00759045]\n",
      "\n",
      "bert/encoder/layer_0/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\n",
      "TF: shape: (768,) values: [ 0.00511063 -0.0166625   0.02812938 -0.01166061  0.01942627]\n",
      "\n",
      "bert/encoder/layer_0/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\n",
      "TF: shape: (768,) values: [ 0.25779155 -0.03077853 -0.2772697  -0.38847703  0.36841765]\n",
      "\n",
      "bert/encoder/layer_0/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\n",
      "TF: shape: (768,) values: [0.9803408  0.959969   0.96368986 0.9603653  0.9801324 ]\n",
      "\n",
      "bert/encoder/layer_0/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\n",
      "TF: shape: (768, 3072) values: [-0.01010427 -0.060398   -0.01468864  0.00311493  0.02862451]\n",
      "\n",
      "bert/encoder/layer_0/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\n",
      "TF: shape: (3072,) values: [-0.11498757 -0.09629171 -0.12399033 -0.129036   -0.06369043]\n",
      "\n",
      "bert/encoder/layer_0/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\n",
      "TF: shape: (3072, 768) values: [-0.03710171  0.0648794   0.00758566 -0.05224452 -0.04348791]\n",
      "\n",
      "bert/encoder/layer_0/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\n",
      "TF: shape: (768,) values: [-0.04801027  0.19766568  0.02154854  0.02880666  0.0444298 ]\n",
      "\n",
      "bert/encoder/layer_0/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\n",
      "TF: shape: (768,) values: [-0.10142924 -0.00499344  0.04274083  0.09324206 -0.10700516]\n",
      "\n",
      "bert/encoder/layer_0/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\n",
      "TF: shape: (768,) values: [0.7835125  0.8072406  0.7670588  0.73706394 0.76303864]\n",
      "\n",
      "bert/encoder/layer_1/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\n",
      "TF: shape: (768, 768) values: [ 0.03132744 -0.01340016 -0.07761582  0.0655639  -0.00337808]\n",
      "\n",
      "bert/encoder/layer_1/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\n",
      "TF: shape: (768,) values: [-0.27827993  0.17387655 -0.2497937  -0.8809636   0.41262135]\n",
      "\n",
      "bert/encoder/layer_1/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\n",
      "TF: shape: (768, 768) values: [-0.03353037  0.04007257  0.05320328 -0.02166729 -0.03581231]\n",
      "\n",
      "bert/encoder/layer_1/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\n",
      "TF: shape: (768,) values: [-0.00504407  0.00136887 -0.00394336  0.00646125 -0.00148919]\n",
      "\n",
      "bert/encoder/layer_1/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\n",
      "TF: shape: (768, 768) values: [-0.00464159  0.06674305 -0.00970626 -0.0276653  -0.01597566]\n",
      "\n",
      "bert/encoder/layer_1/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\n",
      "TF: shape: (768,) values: [ 0.00381288  0.02650839 -0.0059689  -0.00508269 -0.01293722]\n",
      "\n",
      "bert/encoder/layer_1/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\n",
      "TF: shape: (768, 768) values: [-0.01390745 -0.01100563  0.01303005 -0.01969771  0.0125082 ]\n",
      "\n",
      "bert/encoder/layer_1/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
      "TF: shape: (768,) values: [0.02946591 0.05715097 0.01293636 0.01920356 0.00805334]\n",
      "\n",
      "bert/encoder/layer_1/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\n",
      "TF: shape: (768,) values: [ 0.08583715  0.14199966 -0.0856637  -0.18797271  0.21056814]\n",
      "\n",
      "bert/encoder/layer_1/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\n",
      "TF: shape: (768,) values: [0.896962   0.87148863 0.8531161  0.8690647  0.9488987 ]\n",
      "\n",
      "bert/encoder/layer_1/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
      "TF: shape: (768, 3072) values: [ 0.01841293 -0.02650284 -0.09708428 -0.01734244 -0.05529237]\n",
      "\n",
      "bert/encoder/layer_1/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
      "TF: shape: (3072,) values: [-0.15203774 -0.10449131 -0.08440229 -0.09323178 -0.08511415]\n",
      "\n",
      "bert/encoder/layer_1/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\n",
      "TF: shape: (3072, 768) values: [-0.02372648  0.03326349  0.08291997 -0.01519038  0.01868557]\n",
      "\n",
      "bert/encoder/layer_1/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\n",
      "TF: shape: (768,) values: [-0.02514724  0.09868994 -0.027811    0.03749462  0.01086514]\n",
      "\n",
      "bert/encoder/layer_1/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\n",
      "TF: shape: (768,) values: [-0.07662535 -0.10506564  0.03191236  0.07633785 -0.11187791]\n",
      "\n",
      "bert/encoder/layer_1/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\n",
      "TF: shape: (768,) values: [0.9017883  0.8868776  0.8862677  0.85865664 0.87496454]\n",
      "\n",
      "bert/encoder/layer_2/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\n",
      "TF: shape: (768, 768) values: [ 0.08433672  0.09580533  0.07543895 -0.01126779 -0.01354045]\n",
      "\n",
      "bert/encoder/layer_2/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\n",
      "TF: shape: (768,) values: [ 0.0371241   0.03406003  0.27713948 -0.21613775 -0.05275448]\n",
      "\n",
      "bert/encoder/layer_2/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\n",
      "TF: shape: (768, 768) values: [ 0.04794507  0.02517631 -0.01319554 -0.02094732  0.09073472]\n",
      "\n",
      "bert/encoder/layer_2/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\n",
      "TF: shape: (768,) values: [-0.00037404 -0.00125881 -0.00114734 -0.00157741  0.00037122]\n",
      "\n",
      "bert/encoder/layer_2/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\n",
      "TF: shape: (768, 768) values: [-0.01119406 -0.01488636 -0.02960914  0.04746444  0.00428481]\n",
      "\n",
      "bert/encoder/layer_2/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\n",
      "TF: shape: (768,) values: [-0.02728729  0.04979054  0.08326469  0.04150949  0.600959  ]\n",
      "\n",
      "bert/encoder/layer_2/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\n",
      "TF: shape: (768, 768) values: [ 0.00517425  0.01197957  0.0393172  -0.0063884  -0.02673388]\n",
      "\n",
      "bert/encoder/layer_2/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\n",
      "TF: shape: (768,) values: [ 0.01754025  0.1226335  -0.05733554  0.06844623  0.00879776]\n",
      "\n",
      "bert/encoder/layer_2/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\n",
      "TF: shape: (768,) values: [ 0.1490809   0.12386955 -0.19382021 -0.26515856  0.32723007]\n",
      "\n",
      "bert/encoder/layer_2/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\n",
      "TF: shape: (768,) values: [0.8983343  0.88877076 0.86283594 0.8584952  0.9587886 ]\n",
      "\n",
      "bert/encoder/layer_2/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\n",
      "TF: shape: (768, 3072) values: [-0.01619919  0.00662888  0.01492284 -0.01280748  0.01318596]\n",
      "\n",
      "bert/encoder/layer_2/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
      "TF: shape: (3072,) values: [-0.08474881 -0.12850781 -0.11550345 -0.09513011 -0.02519853]\n",
      "\n",
      "bert/encoder/layer_2/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\n",
      "TF: shape: (3072, 768) values: [-0.07225161 -0.0129784   0.00618811 -0.01593373 -0.02160194]\n",
      "\n",
      "bert/encoder/layer_2/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\n",
      "TF: shape: (768,) values: [-0.06319264  0.06169628 -0.03041368  0.00924282  0.06277442]\n",
      "\n",
      "bert/encoder/layer_2/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\n",
      "TF: shape: (768,) values: [-0.1139038  -0.11665309  0.07883061  0.07796711 -0.14219187]\n",
      "\n",
      "bert/encoder/layer_2/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\n",
      "TF: shape: (768,) values: [0.8813261  0.85744697 0.8511922  0.85261875 0.8329574 ]\n",
      "\n",
      "bert/encoder/layer_3/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\n",
      "TF: shape: (768, 768) values: [ 0.05855456 -0.00111438 -0.00828963  0.04117409 -0.07591715]\n",
      "\n",
      "bert/encoder/layer_3/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\n",
      "TF: shape: (768,) values: [ 0.09740101 -0.19290674  0.04332267  0.17937997 -0.08023558]\n",
      "\n",
      "bert/encoder/layer_3/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\n",
      "TF: shape: (768, 768) values: [ 0.02562077  0.02507281 -0.03361562  0.05613289 -0.05435724]\n",
      "\n",
      "bert/encoder/layer_3/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\n",
      "TF: shape: (768,) values: [ 0.00188639 -0.00379197 -0.01020415  0.00969649 -0.00094182]\n",
      "\n",
      "bert/encoder/layer_3/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\n",
      "TF: shape: (768, 768) values: [-0.00539032  0.00959642  0.01325458  0.00490616  0.0129908 ]\n",
      "\n",
      "bert/encoder/layer_3/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
      "TF: shape: (768,) values: [0.04573824 0.05405985 0.00681163 0.00655945 0.01141771]\n",
      "\n",
      "bert/encoder/layer_3/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\n",
      "TF: shape: (768, 768) values: [ 0.01850341  0.03148198  0.02705758 -0.0004669   0.01367511]\n",
      "\n",
      "bert/encoder/layer_3/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\n",
      "TF: shape: (768,) values: [ 0.01981483  0.03566506 -0.05016088  0.02958186  0.04989756]\n",
      "\n",
      "bert/encoder/layer_3/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\n",
      "TF: shape: (768,) values: [ 0.09815404  0.00063774 -0.01257733 -0.26485074  0.22568701]\n",
      "\n",
      "bert/encoder/layer_3/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\n",
      "TF: shape: (768,) values: [0.91457725 0.88453823 0.8340887  0.84203583 0.95247847]\n",
      "\n",
      "bert/encoder/layer_3/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\n",
      "TF: shape: (768, 3072) values: [-0.02733567  0.03307878 -0.01331292 -0.00032527  0.03252084]\n",
      "\n",
      "bert/encoder/layer_3/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\n",
      "TF: shape: (3072,) values: [-0.11436842 -0.15038085 -0.07842971  0.01335877 -0.09492484]\n",
      "\n",
      "bert/encoder/layer_3/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\n",
      "TF: shape: (3072, 768) values: [-0.01751153  0.01631314 -0.02660011  0.03569947 -0.01394763]\n",
      "\n",
      "bert/encoder/layer_3/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\n",
      "TF: shape: (768,) values: [-0.03873252  0.08414765 -0.0399323   0.01997361  0.12924597]\n",
      "\n",
      "bert/encoder/layer_3/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\n",
      "TF: shape: (768,) values: [-0.08049371 -0.06923949 -0.03357155  0.05231095 -0.09717073]\n",
      "\n",
      "bert/encoder/layer_3/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\n",
      "TF: shape: (768,) values: [0.827748   0.83012533 0.82399255 0.81772    0.80794513]\n",
      "\n",
      "bert/encoder/layer_4/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\n",
      "TF: shape: (768, 768) values: [ 0.08296382  0.02076941  0.06525186 -0.02659729  0.03491377]\n",
      "\n",
      "bert/encoder/layer_4/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\n",
      "TF: shape: (768,) values: [ 0.07045844 -0.13412629 -0.0514146   0.00061329  0.1248519 ]\n",
      "\n",
      "bert/encoder/layer_4/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\n",
      "TF: shape: (768, 768) values: [ 0.06941643  0.08133814 -0.0453992   0.0668715  -0.06014847]\n",
      "\n",
      "bert/encoder/layer_4/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\n",
      "TF: shape: (768,) values: [-0.00588725 -0.00235185  0.00281131  0.00173088 -0.00546653]\n",
      "\n",
      "bert/encoder/layer_4/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\n",
      "TF: shape: (768, 768) values: [ 0.06889665  0.06645385  0.01232084  0.0132611  -0.01595679]\n",
      "\n",
      "bert/encoder/layer_4/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\n",
      "TF: shape: (768,) values: [-0.01126871 -0.02704018  0.0301532   0.02332082 -0.04233487]\n",
      "\n",
      "bert/encoder/layer_4/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\n",
      "TF: shape: (768, 768) values: [ 0.02285513 -0.04172142 -0.0146292   0.04862929 -0.0442014 ]\n",
      "\n",
      "bert/encoder/layer_4/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\n",
      "TF: shape: (768,) values: [ 0.03054528  0.00479777 -0.02729505 -0.0325212  -0.00525727]\n",
      "\n",
      "bert/encoder/layer_4/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\n",
      "TF: shape: (768,) values: [ 0.00903359  0.0052285  -0.02841488 -0.22355485  0.28281343]\n",
      "\n",
      "bert/encoder/layer_4/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\n",
      "TF: shape: (768,) values: [0.8849676  0.86927813 0.8114595  0.80269504 0.94864094]\n",
      "\n",
      "bert/encoder/layer_4/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\n",
      "TF: shape: (768, 3072) values: [-0.00639783  0.06198016 -0.03184223  0.00485356 -0.02453273]\n",
      "\n",
      "bert/encoder/layer_4/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
      "TF: shape: (3072,) values: [-0.08770327 -0.11779705 -0.11764182 -0.00192611 -0.1335473 ]\n",
      "\n",
      "bert/encoder/layer_4/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\n",
      "TF: shape: (3072, 768) values: [-0.05421264  0.0221118  -0.02674172  0.03672203 -0.02399626]\n",
      "\n",
      "bert/encoder/layer_4/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\n",
      "TF: shape: (768,) values: [-0.05068972  0.04838871  0.01156022  0.05381602  0.08857913]\n",
      "\n",
      "bert/encoder/layer_4/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\n",
      "TF: shape: (768,) values: [-0.04338909 -0.0781464  -0.01518662  0.04936362 -0.12378412]\n",
      "\n",
      "bert/encoder/layer_4/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
      "TF: shape: (768,) values: [0.8734387 0.8576282 0.8339444 0.8450325 0.8105372]\n",
      "\n",
      "bert/encoder/layer_5/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\n",
      "TF: shape: (768, 768) values: [-0.00858843 -0.03920127  0.02552994 -0.02786552  0.02436485]\n",
      "\n",
      "bert/encoder/layer_5/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\n",
      "TF: shape: (768,) values: [-0.00859117 -0.01642405 -0.04391079  0.01085692  0.02925887]\n",
      "\n",
      "bert/encoder/layer_5/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\n",
      "TF: shape: (768, 768) values: [ 0.00352847  0.02330176 -0.00369894 -0.03904612  0.00294574]\n",
      "\n",
      "bert/encoder/layer_5/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\n",
      "TF: shape: (768,) values: [-0.01087186 -0.01176561  0.00016575 -0.01163023  0.00946616]\n",
      "\n",
      "bert/encoder/layer_5/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\n",
      "TF: shape: (768, 768) values: [ 0.06134222  0.04238288  0.02796064 -0.01284983  0.03683741]\n",
      "\n",
      "bert/encoder/layer_5/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\n",
      "TF: shape: (768,) values: [ 0.05061118 -0.02954445 -0.0034053  -0.00025261  0.0437019 ]\n",
      "\n",
      "bert/encoder/layer_5/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\n",
      "TF: shape: (768, 768) values: [-0.00739815  0.0533964  -0.03736389 -0.04999201  0.01693069]\n",
      "\n",
      "bert/encoder/layer_5/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\n",
      "TF: shape: (768,) values: [-0.0021682   0.01711399 -0.04201518  0.01605333  0.00552063]\n",
      "\n",
      "bert/encoder/layer_5/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\n",
      "TF: shape: (768,) values: [-0.06841327 -0.0146848   0.09792476 -0.23284538  0.2785602 ]\n",
      "\n",
      "bert/encoder/layer_5/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\n",
      "TF: shape: (768,) values: [0.8908311  0.87884724 0.81637293 0.8047641  0.96539867]\n",
      "\n",
      "bert/encoder/layer_5/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\n",
      "TF: shape: (768, 3072) values: [-0.03246041  0.07251058 -0.08201726  0.00772481  0.02532209]\n",
      "\n",
      "bert/encoder/layer_5/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
      "TF: shape: (3072,) values: [-0.09689714 -0.27696273 -0.13047501 -0.10892326 -0.1057625 ]\n",
      "\n",
      "bert/encoder/layer_5/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\n",
      "TF: shape: (3072, 768) values: [ 0.0642072  -0.01738782 -0.05095377  0.00523853  0.04425264]\n",
      "\n",
      "bert/encoder/layer_5/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\n",
      "TF: shape: (768,) values: [-0.0007217   0.06006297  0.0016595   0.03848181  0.06703516]\n",
      "\n",
      "bert/encoder/layer_5/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\n",
      "TF: shape: (768,) values: [-0.00278729 -0.05594506 -0.0631047   0.06023621 -0.18672828]\n",
      "\n",
      "bert/encoder/layer_5/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\n",
      "TF: shape: (768,) values: [0.8621183  0.8515807  0.82654256 0.81729776 0.7985204 ]\n",
      "\n",
      "bert/encoder/layer_6/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\n",
      "TF: shape: (768, 768) values: [-0.02527807 -0.01429243  0.01467054  0.08624706 -0.00188593]\n",
      "\n",
      "bert/encoder/layer_6/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\n",
      "TF: shape: (768,) values: [-0.17319514  0.27564248  0.16801168 -0.10946485  0.1643271 ]\n",
      "\n",
      "bert/encoder/layer_6/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\n",
      "TF: shape: (768, 768) values: [ 0.05886372  0.00706217  0.0398422   0.00882155 -0.04571463]\n",
      "\n",
      "bert/encoder/layer_6/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\n",
      "TF: shape: (768,) values: [-0.00424696 -0.0001192   0.0046079  -0.00315606  0.00434314]\n",
      "\n",
      "bert/encoder/layer_6/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\n",
      "TF: shape: (768, 768) values: [-0.01720381  0.01170722  0.02346902 -0.02284313 -0.03173028]\n",
      "\n",
      "bert/encoder/layer_6/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
      "TF: shape: (768,) values: [-0.03492057  0.01813157 -0.00182878 -0.01420629 -0.00508944]\n",
      "\n",
      "bert/encoder/layer_6/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\n",
      "TF: shape: (768, 768) values: [ 0.0323688  -0.00689882  0.07379091  0.01121114 -0.02059202]\n",
      "\n",
      "bert/encoder/layer_6/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
      "TF: shape: (768,) values: [-0.00648672 -0.05935453 -0.05673229 -0.01152384 -0.02766573]\n",
      "\n",
      "bert/encoder/layer_6/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\n",
      "TF: shape: (768,) values: [-0.06793639  0.03157783  0.15647687 -0.15025291  0.14727171]\n",
      "\n",
      "bert/encoder/layer_6/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\n",
      "TF: shape: (768,) values: [0.8882361  0.8704905  0.80289173 0.77365315 0.92333615]\n",
      "\n",
      "bert/encoder/layer_6/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\n",
      "TF: shape: (768, 3072) values: [ 0.04492201  0.05160861  0.09041415 -0.00742628  0.048133  ]\n",
      "\n",
      "bert/encoder/layer_6/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\n",
      "TF: shape: (3072,) values: [-0.09301704 -0.158612   -0.10633879 -0.09706812 -0.17319229]\n",
      "\n",
      "bert/encoder/layer_6/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\n",
      "TF: shape: (3072, 768) values: [-0.00085372 -0.00974195  0.00684915  0.00038686  0.06610142]\n",
      "\n",
      "bert/encoder/layer_6/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\n",
      "TF: shape: (768,) values: [-0.03254414  0.05681704  0.03720434  0.01936359  0.09134153]\n",
      "\n",
      "bert/encoder/layer_6/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\n",
      "TF: shape: (768,) values: [-0.0117129  -0.03209404 -0.08646043  0.03760341 -0.13841423]\n",
      "\n",
      "bert/encoder/layer_6/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\n",
      "TF: shape: (768,) values: [0.8674175  0.8657014  0.8151861  0.82301307 0.8305737 ]\n",
      "\n",
      "bert/encoder/layer_7/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\n",
      "TF: shape: (768, 768) values: [-0.00075523 -0.01501983  0.04090893  0.01884826  0.04670674]\n",
      "\n",
      "bert/encoder/layer_7/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\n",
      "TF: shape: (768,) values: [ 0.0010344  -0.00423982  0.3117479   0.04494623 -0.01260845]\n",
      "\n",
      "bert/encoder/layer_7/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\n",
      "TF: shape: (768, 768) values: [ 0.02781927 -0.00906972  0.02121989  0.0298591   0.05854786]\n",
      "\n",
      "bert/encoder/layer_7/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\n",
      "TF: shape: (768,) values: [-0.00074918  0.00731079  0.00089338  0.00345652  0.00043817]\n",
      "\n",
      "bert/encoder/layer_7/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\n",
      "TF: shape: (768, 768) values: [-0.01080035 -0.03468366  0.03167168  0.01583073  0.0327719 ]\n",
      "\n",
      "bert/encoder/layer_7/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\n",
      "TF: shape: (768,) values: [-0.02824226  0.01605172  0.00067929 -0.04553111  0.0076044 ]\n",
      "\n",
      "bert/encoder/layer_7/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\n",
      "TF: shape: (768, 768) values: [-0.05496112  0.01006968  0.02206531 -0.01873116  0.02149118]\n",
      "\n",
      "bert/encoder/layer_7/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\n",
      "TF: shape: (768,) values: [ 0.00349772 -0.05831751 -0.0594084  -0.0342187   0.02965918]\n",
      "\n",
      "bert/encoder/layer_7/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\n",
      "TF: shape: (768,) values: [-0.02826844  0.04427591  0.05678326 -0.0475907   0.16136196]\n",
      "\n",
      "bert/encoder/layer_7/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\n",
      "TF: shape: (768,) values: [0.8742141  0.870608   0.79147685 0.7595279  0.9223656 ]\n",
      "\n",
      "bert/encoder/layer_7/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\n",
      "TF: shape: (768, 3072) values: [ 0.03598932 -0.12225644  0.03019998  0.05691092  0.03717208]\n",
      "\n",
      "bert/encoder/layer_7/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
      "TF: shape: (3072,) values: [-0.12465011 -0.08639494 -0.06206005 -0.08012587 -0.08773767]\n",
      "\n",
      "bert/encoder/layer_7/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\n",
      "TF: shape: (3072, 768) values: [-0.02190432 -0.02279165  0.03279508  0.01011065 -0.07793335]\n",
      "\n",
      "bert/encoder/layer_7/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\n",
      "TF: shape: (768,) values: [-0.04282642  0.03700675  0.06142357 -0.04787201  0.02958163]\n",
      "\n",
      "bert/encoder/layer_7/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
      "TF: shape: (768,) values: [-0.03142036 -0.04358427 -0.05132087 -0.01788123 -0.16399944]\n",
      "\n",
      "bert/encoder/layer_7/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\n",
      "TF: shape: (768,) values: [0.83858097 0.8179645  0.80693793 0.81225365 0.7844832 ]\n",
      "\n",
      "bert/encoder/layer_8/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\n",
      "TF: shape: (768, 768) values: [0.0448719  0.02289526 0.03083764 0.03048073 0.02436891]\n",
      "\n",
      "bert/encoder/layer_8/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\n",
      "TF: shape: (768,) values: [-0.25132924 -0.23753347  0.02581017  0.00901509  0.18424493]\n",
      "\n",
      "bert/encoder/layer_8/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\n",
      "TF: shape: (768, 768) values: [-0.01999719  0.00711403  0.03949134 -0.0102224   0.03152475]\n",
      "\n",
      "bert/encoder/layer_8/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
      " -4.4074579e-04]\n",
      "TF: shape: (768,) values: [ 5.5668897e-05  3.4638541e-03 -1.7605867e-03 -6.1321147e-03\n",
      " -4.4074579e-04]\n",
      "\n",
      "bert/encoder/layer_8/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\n",
      "TF: shape: (768, 768) values: [-0.00736056 -0.01795213  0.00104576 -0.00034653  0.03190543]\n",
      "\n",
      "bert/encoder/layer_8/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\n",
      "TF: shape: (768,) values: [ 0.02892835  0.00642501 -0.03608712  0.00264269 -0.0245198 ]\n",
      "\n",
      "bert/encoder/layer_8/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\n",
      "TF: shape: (768, 768) values: [ 0.03971623  0.05307067 -0.01298818  0.00946693 -0.00121235]\n",
      "\n",
      "bert/encoder/layer_8/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\n",
      "TF: shape: (768,) values: [ 0.01468131 -0.05406622 -0.06289103  0.004484    0.0240819 ]\n",
      "\n",
      "bert/encoder/layer_8/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\n",
      "TF: shape: (768,) values: [-0.06004262  0.0457275   0.08688109 -0.14416659 -0.05500487]\n",
      "\n",
      "bert/encoder/layer_8/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\n",
      "TF: shape: (768,) values: [0.8907534  0.89116573 0.811639   0.7810443  0.9045574 ]\n",
      "\n",
      "bert/encoder/layer_8/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\n",
      "TF: shape: (768, 3072) values: [-0.01962814 -0.01482586 -0.02292624  0.03397145  0.02457482]\n",
      "\n",
      "bert/encoder/layer_8/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\n",
      "TF: shape: (3072,) values: [-0.08129632 -0.1691108  -0.10681771 -0.10392351 -0.13120006]\n",
      "\n",
      "bert/encoder/layer_8/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\n",
      "TF: shape: (3072, 768) values: [-0.04683433 -0.02690669  0.02979059  0.02223369 -0.00130287]\n",
      "\n",
      "bert/encoder/layer_8/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\n",
      "TF: shape: (768,) values: [-0.09155537 -0.04465394  0.05649116 -0.09628641  0.11875238]\n",
      "\n",
      "bert/encoder/layer_8/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
      "TF: shape: (768,) values: [-0.06043394 -0.06657387 -0.05341128 -0.00374733 -0.10855272]\n",
      "\n",
      "bert/encoder/layer_8/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
      "TF: shape: (768,) values: [0.84467345 0.84421015 0.82582206 0.84553087 0.8207573 ]\n",
      "\n",
      "bert/encoder/layer_9/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\n",
      "TF: shape: (768, 768) values: [ 0.08004542 -0.0143706  -0.04219061 -0.05175152 -0.01147588]\n",
      "\n",
      "bert/encoder/layer_9/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\n",
      "TF: shape: (768,) values: [-0.14508031  0.40926442 -0.3281781  -0.02869792 -0.26104516]\n",
      "\n",
      "bert/encoder/layer_9/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\n",
      "TF: shape: (768, 768) values: [-0.01337681  0.00615428 -0.0455939   0.03379053 -0.01992556]\n",
      "\n",
      "bert/encoder/layer_9/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\n",
      "TF: shape: (768,) values: [-0.0051302   0.0083288   0.00377641  0.00928865 -0.00418182]\n",
      "\n",
      "bert/encoder/layer_9/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\n",
      "TF: shape: (768, 768) values: [-0.02485976 -0.0301923   0.00984638 -0.02495162  0.01074037]\n",
      "\n",
      "bert/encoder/layer_9/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\n",
      "TF: shape: (768,) values: [-0.04229928 -0.02636711  0.0060447   0.00222829  0.04979481]\n",
      "\n",
      "bert/encoder/layer_9/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\n",
      "TF: shape: (768, 768) values: [-0.01258144  0.00871274  0.00482882 -0.00675888 -0.04390825]\n",
      "\n",
      "bert/encoder/layer_9/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\n",
      "TF: shape: (768,) values: [ 0.02457753  0.05051134 -0.06890804 -0.00962795  0.00864793]\n",
      "\n",
      "bert/encoder/layer_9/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\n",
      "TF: shape: (768,) values: [-0.08963391 -0.06362236  0.0676669  -0.09895685  0.08318913]\n",
      "\n",
      "bert/encoder/layer_9/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\n",
      "TF: shape: (768,) values: [0.85100883 0.82569736 0.7927931  0.7660444  0.8912934 ]\n",
      "\n",
      "bert/encoder/layer_9/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\n",
      "TF: shape: (768, 3072) values: [ 0.06290598  0.0203122  -0.05384256  0.05442941  0.00484769]\n",
      "\n",
      "bert/encoder/layer_9/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
      "TF: shape: (3072,) values: [-0.10818483 -0.00169527 -0.08962701 -0.10280421 -0.14310956]\n",
      "\n",
      "bert/encoder/layer_9/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\n",
      "TF: shape: (3072, 768) values: [ 0.05487705  0.01644666  0.00436198 -0.00490768 -0.03238423]\n",
      "\n",
      "bert/encoder/layer_9/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\n",
      "TF: shape: (768,) values: [-0.08755219 -0.01910074 -0.02988298 -0.08150438  0.09897955]\n",
      "\n",
      "bert/encoder/layer_9/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
      "TF: shape: (768,) values: [-0.04136161 -0.02113917 -0.07581077 -0.00809791 -0.09790538]\n",
      "\n",
      "bert/encoder/layer_9/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\n",
      "TF: shape: (768,) values: [0.8250572  0.83477134 0.7794141  0.81264955 0.7827918 ]\n",
      "\n",
      "bert/encoder/layer_10/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\n",
      "TF: shape: (768, 768) values: [ 0.00071212 -0.00853064  0.01776993  0.03189976  0.02183623]\n",
      "\n",
      "bert/encoder/layer_10/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\n",
      "TF: shape: (768,) values: [-0.03667567 -0.01449654 -0.03822913  0.00118343 -0.05489838]\n",
      "\n",
      "bert/encoder/layer_10/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
      "TF: shape: (768, 768) values: [-0.0494106   0.05531096 -0.02459413 -0.06019118 -0.02829785]\n",
      "\n",
      "bert/encoder/layer_10/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\n",
      "TF: shape: (768,) values: [-0.00692997  0.00855893  0.00670777 -0.0052475  -0.00017074]\n",
      "\n",
      "bert/encoder/layer_10/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\n",
      "TF: shape: (768, 768) values: [ 0.01911842  0.04858809 -0.02608485  0.00794924 -0.02246636]\n",
      "\n",
      "bert/encoder/layer_10/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\n",
      "TF: shape: (768,) values: [-0.0133503  -0.01224133 -0.0051834  -0.00232528  0.00148614]\n",
      "\n",
      "bert/encoder/layer_10/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\n",
      "TF: shape: (768, 768) values: [-0.05904732  0.02616     0.00794104 -0.02889086 -0.03692576]\n",
      "\n",
      "bert/encoder/layer_10/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\n",
      "TF: shape: (768,) values: [0.02089205 0.01458059 0.05217785 0.0324267  0.00907548]\n",
      "\n",
      "bert/encoder/layer_10/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\n",
      "TF: shape: (768,) values: [-0.10986238 -0.04332284  0.02603893 -0.06236923  0.14469369]\n",
      "\n",
      "bert/encoder/layer_10/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\n",
      "TF: shape: (768,) values: [0.8515822  0.81392974 0.836747   0.78040504 0.88091415]\n",
      "\n",
      "bert/encoder/layer_10/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\n",
      "TF: shape: (768, 3072) values: [-0.07061081  0.06997397  0.01433633  0.04150929  0.02865192]\n",
      "\n",
      "bert/encoder/layer_10/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\n",
      "TF: shape: (3072,) values: [-0.13879126 -0.06401426 -0.1408043  -0.15043251 -0.10193057]\n",
      "\n",
      "bert/encoder/layer_10/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\n",
      "TF: shape: (3072, 768) values: [ 0.02918765  0.02609882 -0.02259856  0.01636725 -0.00038442]\n",
      "\n",
      "bert/encoder/layer_10/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\n",
      "TF: shape: (768,) values: [-0.01799502  0.10970547 -0.02384165 -0.03350981  0.10491351]\n",
      "\n",
      "bert/encoder/layer_10/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\n",
      "TF: shape: (768,) values: [ 0.00999107 -0.0217309  -0.0854177  -0.01109101 -0.07902174]\n",
      "\n",
      "bert/encoder/layer_10/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\n",
      "TF: shape: (768,) values: [0.8272796  0.8597452  0.79116803 0.81267637 0.8273501 ]\n",
      "\n",
      "bert/encoder/layer_11/attention/self/query/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\n",
      "TF: shape: (768, 768) values: [-0.04141425 -0.06491017 -0.03202523  0.06226195  0.02193764]\n",
      "\n",
      "bert/encoder/layer_11/attention/self/query/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\n",
      "TF: shape: (768,) values: [ 0.0501296   0.11886728  0.2186807   0.08720991 -0.20476632]\n",
      "\n",
      "bert/encoder/layer_11/attention/self/key/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\n",
      "TF: shape: (768, 768) values: [ 0.02634268 -0.01357682 -0.06076496  0.04210597  0.01783857]\n",
      "\n",
      "bert/encoder/layer_11/attention/self/key/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\n",
      "TF: shape: (768,) values: [-0.0007798  -0.00065806 -0.00010521  0.00119144 -0.00180091]\n",
      "\n",
      "bert/encoder/layer_11/attention/self/value/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\n",
      "TF: shape: (768, 768) values: [ 0.03520973 -0.00678078 -0.02883583 -0.01011515  0.04519828]\n",
      "\n",
      "bert/encoder/layer_11/attention/self/value/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\n",
      "TF: shape: (768,) values: [ 0.01502306 -0.00530942  0.00023572  0.00205218 -0.00578036]\n",
      "\n",
      "bert/encoder/layer_11/attention/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\n",
      "TF: shape: (768, 768) values: [ 0.02361419  0.03112707 -0.00063031  0.04209773 -0.02434015]\n",
      "\n",
      "bert/encoder/layer_11/attention/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\n",
      "TF: shape: (768,) values: [ 0.02566087  0.0028438  -0.00475678  0.02149458 -0.01755187]\n",
      "\n",
      "bert/encoder/layer_11/attention/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\n",
      "TF: shape: (768,) values: [-0.03134411  0.01207957 -0.04636396 -0.03013046  0.07944281]\n",
      "\n",
      "bert/encoder/layer_11/attention/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\n",
      "TF: shape: (768,) values: [0.85203767 0.8020145  0.8554237  0.8150477  0.8441815 ]\n",
      "\n",
      "bert/encoder/layer_11/intermediate/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\n",
      "TF: shape: (768, 3072) values: [ 0.05871898 -0.01124212  0.00206979 -0.04366514 -0.00716808]\n",
      "\n",
      "bert/encoder/layer_11/intermediate/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
      "TF: shape: (3072,) values: [-0.09762521 -0.06175711 -0.05153917 -0.08580919 -0.09734315]\n",
      "\n",
      "bert/encoder/layer_11/output/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\n",
      "TF: shape: (3072, 768) values: [-0.022382    0.01073206 -0.01357213  0.02484621  0.01403091]\n",
      "\n",
      "bert/encoder/layer_11/output/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\n",
      "TF: shape: (768,) values: [-0.06574099  0.04207807  0.01201084  0.00229322  0.05551811]\n",
      "\n",
      "bert/encoder/layer_11/output/LayerNorm/beta\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\n",
      "TF: shape: (768,) values: [-0.00634605 -0.01989403  0.04628465  0.01585056 -0.04256899]\n",
      "\n",
      "bert/encoder/layer_11/output/LayerNorm/gamma\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\n",
      "TF: shape: (768,) values: [0.6384234  0.6300364  0.66570055 0.6126921  0.63756436]\n",
      "\n",
      "bert/pooler/dense/kernel\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\n",
      "TF: shape: (768, 768) values: [-0.00127425  0.00199868 -0.03863145 -0.00139355  0.00691627]\n",
      "\n",
      "bert/pooler/dense/bias\n",
      "|sum(pt_wts - tf_wts)| = 0.0\n",
      "PT: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\n",
      "TF: shape: (768,) values: [-0.03597581 -0.00389536  0.05181352  0.02224747 -0.00493723]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# PyTorch parameters whose name contains one of these substrings are transposed\n",
    "# before comparison (`.T` is a no-op for the 1-D bias vectors that also match).\n",
    "tensors_to_transpose = (\n",
    "    \"dense.weight\",\n",
    "    \"attention.self.query\",\n",
    "    \"attention.self.key\",\n",
    "    \"attention.self.value\"\n",
    ")\n",
    "# Ordered (pattern, replacement) rewrites. Order matters: the '*.weight' embedding\n",
    "# rules must run before '.' -> '/', and the LayerNorm rules after it but before\n",
    "# the generic 'weight' -> 'kernel' rule.\n",
    "var_map = (\n",
    "    ('layer.', 'layer_'),\n",
    "    ('word_embeddings.weight', 'word_embeddings'),\n",
    "    ('position_embeddings.weight', 'position_embeddings'),\n",
    "    ('token_type_embeddings.weight', 'token_type_embeddings'),\n",
    "    ('.', '/'),\n",
    "    ('LayerNorm/weight', 'LayerNorm/gamma'),\n",
    "    ('LayerNorm/bias', 'LayerNorm/beta'),\n",
    "    ('weight', 'kernel')\n",
    ")\n",
    "\n",
    "def to_tf_var_name(name: str) -> str:\n",
    "    \"\"\"Translate a PyTorch state-dict key into a TensorFlow variable name.\n",
    "\n",
    "    Applies every (pattern, replacement) pair of `var_map` in order and\n",
    "    prefixes the result with 'bert/'.\n",
    "    \"\"\"\n",
    "    for patt, repl in var_map:\n",
    "        name = name.replace(patt, repl)\n",
    "    return 'bert/{}'.format(name)\n",
    "\n",
    "def strip_output_suffix(name: str) -> str:\n",
    "    \"\"\"Remove a trailing ':0' from `name`; any other name is returned unchanged.\n",
    "\n",
    "    `str.strip(\":0\")` is not used because it strips the *characters* ':' and '0'\n",
    "    from both ends, which would also eat a legitimate leading/trailing '0'.\n",
    "    \"\"\"\n",
    "    return name[:-2] if name.endswith(\":0\") else name\n",
    "\n",
    "tf_vars = {v.name: session.run(fetches=v) for v in tf.global_variables()}\n",
    "pt_vars = {}\n",
    "for pt_name, tensor in pt_model.state_dict().items():\n",
    "    weights = tensor.detach().numpy()\n",
    "    if any(patt in pt_name for patt in tensors_to_transpose):\n",
    "        weights = weights.T\n",
    "    pt_vars[to_tf_var_name(pt_name)] = weights\n",
    "\n",
    "# NOTE: the loop variables are deliberately NOT called `tf`/`pt`: rebinding `tf`\n",
    "# would shadow the TensorFlow module and break this cell on re-run.\n",
    "for var_name, tf_wts in tf_vars.items():\n",
    "    clean_name = strip_output_suffix(var_name)\n",
    "    pt_wts = pt_vars[clean_name]\n",
    "\n",
    "    print(clean_name)\n",
    "\n",
    "    # Fail early with a clear message instead of silently broadcasting.\n",
    "    assert pt_wts.shape == tf_wts.shape, \"shape mismatch for {}: {} vs {}\".format(\n",
    "        clean_name, pt_wts.shape, tf_wts.shape\n",
    "    )\n",
    "\n",
    "    # Summary metric (kept for the printed report).\n",
    "    print(\"|sum(pt_wts - tf_wts)| = {}\".format(\n",
    "        np.abs(np.sum(pt_wts - tf_wts, keepdims=False))\n",
    "    ))\n",
    "    # Assert equivalence element-wise: a plain sum of differences can be zero\n",
    "    # even when individual weights differ (positive and negative errors cancel).\n",
    "    assert np.array_equal(pt_wts, tf_wts), \"weights differ for {}\".format(clean_name)\n",
    "\n",
    "    # Show the first five values (of the first row for 2-D weights).\n",
    "    if len(pt_wts.shape) == 2:\n",
    "        print(\"PT: shape: {0} values: {1}\".format(pt_wts.shape, pt_wts[0, :5]))\n",
    "        print(\"TF: shape: {0} values: {1}\".format(tf_wts.shape, tf_wts[0, :5]))\n",
    "    else:\n",
    "        print(\"PT: shape: {0} values: {1}\".format(pt_wts.shape, pt_wts[:5]))\n",
    "        print(\"TF: shape: {0} values: {1}\".format(tf_wts.shape, tf_wts[:5]))\n",
    "    print()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compare Layer-12 Projections"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MSE: 2.7155439966009e-05\n",
      "PT-values: [-0.876663   -0.41088238 -0.12200808  0.44941     0.19445966]\n",
      "TF-values: [-0.8742865  -0.40621698 -0.10585472  0.444904    0.1825743 ]\n"
     ]
    }
   ],
   "source": [
    "# Mean squared error (MSE) between the last projection of the PT and TF models\n",
    "embedding_diff = pt_embedding - tf_embedding\n",
    "MSE = np.mean(embedding_diff ** 2)\n",
    "pt_head, tf_head = pt_embedding[0, :5], tf_embedding[0, :5]\n",
    "print(\"MSE: {}\".format(MSE))\n",
    "print(\"PT-values: {}\".format(pt_head))\n",
    "print(\"TF-values: {}\".format(tf_head))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlp",
   "language": "python",
   "name": "nlp"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
